Introduction

This document contains the exploratory data analysis (EDA) for the Airbnb Berlin rental demand prediction project. The goal is to gain insights from the data and identify patterns that will inform our predictive modeling approach.

Data Loading and Column Name Exploration

First, we need to load the dataset from the data directory and inspect the column names to ensure consistency.

# Set the file path to the existing data
# NOTE(review): relative to this document's location — confirm the knit
# working directory so that ../data/raw/ resolves correctly.
data_path <- "../data/raw/"

# Load the train and test datasets
train_data <- read.csv(paste0(data_path, "train_airbnb_berlin.csv"))
test_data <- read.csv(paste0(data_path, "test_airbnb_berlin.csv"))

# Display basic information about the datasets
cat("Train dataset dimensions:", dim(train_data)[1], "rows,", dim(train_data)[2], "columns\n")
## Train dataset dimensions: 15692 rows, 39 columns
cat("Test dataset dimensions:", dim(test_data)[1], "rows,", dim(test_data)[2], "columns\n")
## Test dataset dimensions: 7842 rows, 38 columns
# Examine column names
# The test set has one column fewer (38 vs 39) — presumably the price
# target; verify against the test file's header.
cat("\nOriginal train dataset column names:\n")
## 
## Original train dataset column names:
print(colnames(train_data))
##  [1] "Listing.ID"            "Listing.Name"          "Host.ID"              
##  [4] "Host.Name"             "Host.Since"            "Host.Response.Time"   
##  [7] "Host.Response.Rate"    "Is.Superhost"          "neighbourhood"        
## [10] "Neighborhood.Group"    "City"                  "Postal.Code"          
## [13] "Country.Code"          "Country"               "Latitude"             
## [16] "Longitude"             "Is.Exact.Location"     "Property.Type"        
## [19] "Room.Type"             "Accomodates"           "Bathrooms"            
## [22] "Bedrooms"              "Beds"                  "Square.Feet"          
## [25] "Guests.Included"       "Min.Nights"            "Reviews"              
## [28] "First.Review"          "Last.Review"           "Overall.Rating"       
## [31] "Accuracy.Rating"       "Cleanliness.Rating"    "Checkin.Rating"       
## [34] "Communication.Rating"  "Location.Rating"       "Value.Rating"         
## [37] "Instant.Bookable"      "Business.Travel.Ready" "Price"
# Standardize column names to lower snake_case (e.g. "Listing.ID" becomes
# "listing_id") so the rest of the analysis can rely on one naming scheme.
# Plain function calls: a single-step pipe adds no clarity here.
train_data <- janitor::clean_names(train_data)
test_data <- janitor::clean_names(test_data)

# Examine cleaned column names to confirm the snake_case conversion
cat("\nCleaned train dataset column names (lowercase):\n")
## 
## Cleaned train dataset column names (lowercase):
print(colnames(train_data))
##  [1] "listing_id"            "listing_name"          "host_id"              
##  [4] "host_name"             "host_since"            "host_response_time"   
##  [7] "host_response_rate"    "is_superhost"          "neighbourhood"        
## [10] "neighborhood_group"    "city"                  "postal_code"          
## [13] "country_code"          "country"               "latitude"             
## [16] "longitude"             "is_exact_location"     "property_type"        
## [19] "room_type"             "accomodates"           "bathrooms"            
## [22] "bedrooms"              "beds"                  "square_feet"          
## [25] "guests_included"       "min_nights"            "reviews"              
## [28] "first_review"          "last_review"           "overall_rating"       
## [31] "accuracy_rating"       "cleanliness_rating"    "checkin_rating"       
## [34] "communication_rating"  "location_rating"       "value_rating"         
## [37] "instant_bookable"      "business_travel_ready" "price"
# Preview train data: render the first six rows as a formatted table
kable(head(train_data))
listing_id listing_name host_id host_name host_since host_response_time host_response_rate is_superhost neighbourhood neighborhood_group city postal_code country_code country latitude longitude is_exact_location property_type room_type accomodates bathrooms bedrooms beds square_feet guests_included min_nights reviews first_review last_review overall_rating accuracy_rating cleanliness_rating checkin_rating communication_rating location_rating value_rating instant_bookable business_travel_ready price
19665213 * 156079597 Maximilian 2016-01-20 f Prenzlauer Berg Pankow Berlin 10437.0 DE Germany 52.54652 13.41792 t Apartment Private room 2 1.0 1.0 1.0 NA 1 2 6 2017-07-07 2017-08-08 100 10 10 10 10 9 10 t f 26
6436842 * 5302290 Dulie 2013-04-07 f Pankow Pankow Berlin 13187.0 DE Germany 52.56512 13.42214 t Apartment Entire home/apt 2 1.0 2.0 2.0 NA 2 7 6 2015-05-26 2019-04-30 90 9 9 10 10 9 10 f f 41
10559468 * 59151456 Geank 2016-02-07 f Prenzlauer Berg Pankow Berlin 10439.0 DE Germany 52.54741 13.42521 t Apartment Entire home/apt 3 1.0 1.0 2.0 NA 1 1 2 2016-04-19 2016-07-04 100 10 10 10 10 10 10 f f 50
27215482 * 193452785 Alix 2018-06-26 f Friedrichshain Friedrichshain-Kreuzberg Berlin 10245 DE Germany 52.50958 13.45144 t Apartment Private room 2 1.0 1.0 1.0 NA 1 2 4 2018-07-31 2018-08-12 100 10 10 10 10 10 9 f f 50
27287546 * 205870244 Lurina 2013-05-16 within a few hours 92% t Prenzlauer Berg Pankow Berlin 10405.0 DE Germany 52.52995 13.41558 t Apartment Private room 3 1.0 1.0 2.0 NA 1 6 0 NA NA NA NA NA NA NA t f 55
26590915 * 90250336 Zan 2016-08-22 within an hour 100% t Mariendorf Tempelhof - Schöneberg Berlin * DE Germany 52.44826 13.40608 t Condominium Private room 3 1.0 1.0 2.0 NA 2 1 10 2018-09-25 2019-05-03 99 10 9 10 10 10 10 t f 39

Results: The dataset contains information on Berlin Airbnb listings. We’ve first examined the original column names and then standardized them to lowercase with snake_case formatting using the janitor::clean_names() function. This ensures consistency across our analysis. The training dataset has over 15,000 rows and 39 columns, while the test dataset has about 7,800 rows and 38 columns — one column fewer because it omits the price target we are predicting.

Data Structure and Types

Let’s examine the structure and summary statistics of our data.

# Check data types for each column.
# NOTE(review): several numeric-looking fields (accomodates, bathrooms,
# bedrooms, beds, guests_included, min_nights, host_response_rate) were read
# as character — they will need explicit type conversion before modeling.
str(train_data)
## 'data.frame':    15692 obs. of  39 variables:
##  $ listing_id           : num  19665213 6436842 10559468 27215482 27287546 ...
##  $ listing_name         : chr  "*" "*" "*" "*" ...
##  $ host_id              : num  1.56e+08 5.30e+06 5.92e+07 1.93e+08 2.06e+08 ...
##  $ host_name            : chr  "Maximilian" "Dulie" "Geank" "Alix" ...
##  $ host_since           : chr  "2016-01-20" "2013-04-07" "2016-02-07" "2018-06-26" ...
##  $ host_response_time   : chr  "" "" "" "" ...
##  $ host_response_rate   : chr  "" "" "" "" ...
##  $ is_superhost         : chr  "f" "f" "f" "f" ...
##  $ neighbourhood        : chr  "Prenzlauer Berg" "Pankow" "Prenzlauer Berg" "Friedrichshain" ...
##  $ neighborhood_group   : chr  "Pankow" "Pankow" "Pankow" "Friedrichshain-Kreuzberg" ...
##  $ city                 : chr  "Berlin" "Berlin" "Berlin" "Berlin" ...
##  $ postal_code          : chr  "10437.0" "13187.0" "10439.0" "10245" ...
##  $ country_code         : chr  "DE" "DE" "DE" "DE" ...
##  $ country              : chr  "Germany" "Germany" "Germany" "Germany" ...
##  $ latitude             : num  52.5 52.6 52.5 52.5 52.5 ...
##  $ longitude            : num  13.4 13.4 13.4 13.5 13.4 ...
##  $ is_exact_location    : chr  "t" "t" "t" "t" ...
##  $ property_type        : chr  "Apartment" "Apartment" "Apartment" "Apartment" ...
##  $ room_type            : chr  "Private room" "Entire home/apt" "Entire home/apt" "Private room" ...
##  $ accomodates          : chr  "2" "2" "3" "2" ...
##  $ bathrooms            : chr  "1.0" "1.0" "1.0" "1.0" ...
##  $ bedrooms             : chr  "1.0" "2.0" "1.0" "1.0" ...
##  $ beds                 : chr  "1.0" "2.0" "2.0" "1.0" ...
##  $ square_feet          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ guests_included      : chr  "1" "2" "1" "1" ...
##  $ min_nights           : chr  "2" "7" "1" "2" ...
##  $ reviews              : int  6 6 2 4 0 10 5 14 13 2 ...
##  $ first_review         : chr  "2017-07-07" "2015-05-26" "2016-04-19" "2018-07-31" ...
##  $ last_review          : chr  "2017-08-08" "2019-04-30" "2016-07-04" "2018-08-12" ...
##  $ overall_rating       : num  100 90 100 100 NA 99 97 94 93 100 ...
##  $ accuracy_rating      : num  10 9 10 10 NA 10 10 9 9 10 ...
##  $ cleanliness_rating   : num  10 9 10 10 NA 9 10 9 10 9 ...
##  $ checkin_rating       : num  10 10 10 10 NA 10 10 10 9 10 ...
##  $ communication_rating : num  10 10 10 10 NA 10 10 10 9 10 ...
##  $ location_rating      : num  9 9 10 10 NA 10 10 10 10 9 ...
##  $ value_rating         : num  10 10 10 9 NA 10 10 10 9 10 ...
##  $ instant_bookable     : chr  "t" "f" "f" "f" ...
##  $ business_travel_ready: chr  "f" "f" "f" "f" ...
##  $ price                : num  26 41 50 50 55 39 94 73 100 50 ...
# Summary statistics (character columns only report length/class here,
# which is another sign they need type conversion)
summary(train_data)
##    listing_id       listing_name          host_id           host_name        
##  Min.   :   22415   Length:15692       Min.   :    11015   Length:15692      
##  1st Qu.: 9052831   Class :character   1st Qu.:  9274528   Class :character  
##  Median :18650372   Mode  :character   Median : 33581823   Mode  :character  
##  Mean   :17800618                      Mean   : 61190420                     
##  3rd Qu.:26113085                      3rd Qu.: 93256775                     
##  Max.   :34674496                      Max.   :260969848                     
##  NA's   :1                                                                   
##   host_since        host_response_time host_response_rate is_superhost      
##  Length:15692       Length:15692       Length:15692       Length:15692      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  neighbourhood      neighborhood_group     city           postal_code       
##  Length:15692       Length:15692       Length:15692       Length:15692      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  country_code         country             latitude       longitude    
##  Length:15692       Length:15692       Min.   :52.37   Min.   :13.12  
##  Class :character   Class :character   1st Qu.:52.49   1st Qu.:13.38  
##  Mode  :character   Mode  :character   Median :52.51   Median :13.42  
##                                        Mean   :52.51   Mean   :13.41  
##                                        3rd Qu.:52.53   3rd Qu.:13.44  
##                                        Max.   :52.64   Max.   :13.71  
##                                                                       
##  is_exact_location  property_type       room_type         accomodates       
##  Length:15692       Length:15692       Length:15692       Length:15692      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   bathrooms           bedrooms             beds            square_feet    
##  Length:15692       Length:15692       Length:15692       Min.   :   0.0  
##  Class :character   Class :character   Class :character   1st Qu.:   0.0  
##  Mode  :character   Mode  :character   Mode  :character   Median : 440.0  
##                                                           Mean   : 445.9  
##                                                           3rd Qu.: 700.0  
##                                                           Max.   :1912.0  
##                                                           NA's   :15389   
##  guests_included     min_nights           reviews       first_review      
##  Length:15692       Length:15692       Min.   :  0.00   Length:15692      
##  Class :character   Class :character   1st Qu.:  1.00   Class :character  
##  Mode  :character   Mode  :character   Median :  5.00   Mode  :character  
##                                        Mean   : 19.45                     
##                                        3rd Qu.: 17.00                     
##                                        Max.   :424.00                     
##                                                                           
##  last_review        overall_rating   accuracy_rating  cleanliness_rating
##  Length:15692       Min.   : 20.00   Min.   : 2.000   Min.   : 2.000    
##  Class :character   1st Qu.: 93.00   1st Qu.:10.000   1st Qu.: 9.000    
##  Mode  :character   Median : 97.00   Median :10.000   Median :10.000    
##                     Mean   : 94.72   Mean   : 9.717   Mean   : 9.328    
##                     3rd Qu.:100.00   3rd Qu.:10.000   3rd Qu.:10.000    
##                     Max.   :100.00   Max.   :10.000   Max.   :10.000    
##                     NA's   :2962     NA's   :2971     NA's   :2970      
##  checkin_rating   communication_rating location_rating   value_rating   
##  Min.   : 2.000   Min.   : 2.000       Min.   : 2.000   Min.   : 2.000  
##  1st Qu.:10.000   1st Qu.:10.000       1st Qu.: 9.000   1st Qu.: 9.000  
##  Median :10.000   Median :10.000       Median :10.000   Median :10.000  
##  Mean   : 9.769   Mean   : 9.779       Mean   : 9.557   Mean   : 9.458  
##  3rd Qu.:10.000   3rd Qu.:10.000       3rd Qu.:10.000   3rd Qu.:10.000  
##  Max.   :10.000   Max.   :10.000       Max.   :10.000   Max.   :10.000  
##  NA's   :2973     NA's   :2970         NA's   :2971     NA's   :2972    
##  instant_bookable   business_travel_ready     price       
##  Length:15692       Length:15692          Min.   :  8.00  
##  Class :character   Class :character      1st Qu.: 32.00  
##  Mode  :character   Mode  :character      Median : 49.00  
##                                           Mean   : 60.34  
##                                           3rd Qu.: 70.00  
##                                           Max.   :900.00  
##                                           NA's   :9
# Use skimr for a more comprehensive summary: completeness rates, unique
# counts, and inline histograms per variable
skim(train_data)
Data summary
Name train_data
Number of rows 15692
Number of columns 39
_______________________
Column type frequency:
character 25
numeric 14
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
listing_name 0 1 0 1 54 2 0
host_name 0 1 0 34 22 11509 0
host_since 0 1 0 10 21 3030 0
host_response_time 0 1 0 18 7075 5 0
host_response_rate 0 1 0 4 7075 35 0
is_superhost 0 1 0 1 23 3 0
neighbourhood 0 1 1 21 0 63 0
neighborhood_group 0 1 5 24 0 12 0
city 0 1 0 6 1 3 0
postal_code 0 1 0 7 229 186 0
country_code 0 1 2 2 0 1 0
country 0 1 7 7 0 1 0
is_exact_location 0 1 1 1 0 2 0
property_type 0 1 1 18 0 16 0
room_type 0 1 11 15 0 3 0
accomodates 0 1 1 2 0 13 0
bathrooms 0 1 0 3 14 9 0
bedrooms 0 1 0 3 5 8 0
beds 0 1 0 4 8 13 0
guests_included 0 1 1 1 0 8 0
min_nights 0 1 1 3 0 25 0
first_review 0 1 0 10 2705 2244 0
last_review 0 1 0 10 2706 1451 0
instant_bookable 0 1 1 1 0 2 0
business_travel_ready 0 1 1 1 0 1 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
listing_id 1 1.00 17800618.34 9952544.25 22415.00 9052831.00 18650372.00 26113085.00 34674496.00 ▇▇▇▇▇
host_id 0 1.00 61190419.66 67394205.32 11015.00 9274527.75 33581823.00 93256774.75 260969848.00 ▇▂▁▁▁
latitude 0 1.00 52.51 0.03 52.37 52.49 52.51 52.53 52.64 ▁▂▇▃▁
longitude 0 1.00 13.41 0.06 13.12 13.38 13.42 13.44 13.71 ▁▂▇▁▁
square_feet 15389 0.02 445.90 414.82 0.00 0.00 440.00 700.00 1912.00 ▇▇▂▁▁
reviews 0 1.00 19.45 39.48 0.00 1.00 5.00 17.00 424.00 ▇▁▁▁▁
overall_rating 2962 0.81 94.72 7.07 20.00 93.00 97.00 100.00 100.00 ▁▁▁▁▇
accuracy_rating 2971 0.81 9.72 0.67 2.00 10.00 10.00 10.00 10.00 ▁▁▁▁▇
cleanliness_rating 2970 0.81 9.33 1.02 2.00 9.00 10.00 10.00 10.00 ▁▁▁▁▇
checkin_rating 2973 0.81 9.77 0.62 2.00 10.00 10.00 10.00 10.00 ▁▁▁▁▇
communication_rating 2970 0.81 9.78 0.62 2.00 10.00 10.00 10.00 10.00 ▁▁▁▁▇
location_rating 2971 0.81 9.56 0.73 2.00 9.00 10.00 10.00 10.00 ▁▁▁▁▇
value_rating 2972 0.81 9.46 0.79 2.00 9.00 10.00 10.00 10.00 ▁▁▁▁▇
price 9 1.00 60.34 48.83 8.00 32.00 49.00 70.00 900.00 ▇▁▁▁▁

Initial Data Exploration

Missing Values

Let’s examine missing values in the dataset.

# Count missing values in each column
missing_train <- colSums(is.na(train_data))
missing_test <- colSums(is.na(test_data))

# Display columns with missing values
# NOTE(review): this only counts explicit NA. Columns that use "" or "*" as
# placeholders (e.g. host_response_time, listing_name, postal_code per the
# skim output above) are not captured here.
missing_train[missing_train > 0]
##           listing_id          square_feet       overall_rating 
##                    1                15389                 2962 
##      accuracy_rating   cleanliness_rating       checkin_rating 
##                 2971                 2970                 2973 
## communication_rating      location_rating         value_rating 
##                 2970                 2971                 2972 
##                price 
##                    9
missing_test[missing_test > 0]
##           listing_id              host_id          square_feet 
##                    1                    1                 7694 
##       overall_rating      accuracy_rating   cleanliness_rating 
##                 1515                 1519                 1518 
##       checkin_rating communication_rating      location_rating 
##                 1522                 1520                 1522 
##         value_rating 
##                 1521
# Visualize the proportion of missing values per column.
# pivot_longer() replaces the superseded gather(); names_to/values_to keep
# the "key"/"value" column names so the aesthetics below are unchanged.
# mean(is.na(.x)) is the direct form of sum(is.na(.))/n(), and geom_col()
# is the idiomatic replacement for geom_bar(stat = "identity").
train_data %>%
  summarise(across(everything(), ~ mean(is.na(.x)))) %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(x = reorder(key, value), y = value)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Proportion of Missing Values in Train Dataset",
       x = "",
       y = "Proportion Missing") +
  theme_minimal() +
  scale_y_continuous(labels = scales::percent)

Results: Several columns contain missing values, with the rating-related fields having the highest proportion of missing data (approximately 19%). This is expected since not all listings have received reviews or ratings. The rating columns (accuracy_rating, cleanliness_rating, checkin_rating, communication_rating, location_rating, and value_rating) show consistent missing patterns, suggesting that when a listing has no reviews, all these rating fields are missing together. We’ll need to implement appropriate strategies to handle these missing values in our modeling approach.

Price Distribution Analysis

Let’s examine the price distribution.

# Basic statistics for the nightly price (note: 9 listings have NA price)
price_stats <- summary(train_data$price)
print(price_stats)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    8.00   32.00   49.00   60.34   70.00  900.00       9
# Histogram of nightly prices on the natural scale
price_hist <- ggplot(train_data, aes(x = price)) +
  geom_histogram(bins = 50, fill = "steelblue", alpha = 0.7) +
  theme_minimal() +
  labs(title = "Distribution of Airbnb Prices in Berlin",
       x = "Price (Euro)",
       y = "Count")
print(price_hist)

# The same histogram on a log10 x-axis, which unpacks the right-skewed tail
price_hist_log <- ggplot(train_data, aes(x = price)) +
  geom_histogram(bins = 50, fill = "steelblue", alpha = 0.7) +
  scale_x_log10() +
  theme_minimal() +
  labs(title = "Distribution of Airbnb Prices in Berlin (Log Scale)",
       x = "Price (Euro) - Log Scale",
       y = "Count")
print(price_hist_log)

# Boxplot highlighting the high-price outliers
price_box <- ggplot(train_data, aes(y = price)) +
  geom_boxplot(fill = "steelblue", alpha = 0.7) +
  theme_minimal() +
  labs(title = "Boxplot of Airbnb Prices in Berlin",
       y = "Price (Euro)")
print(price_box)

Results: The price distribution is right-skewed, with a median of approximately €49 and a mean of around €60. The majority of listings are priced between €25 and €100 per night, with some outliers reaching as high as €900. The log-transformed histogram shows that the price distribution becomes more normal when viewed on a logarithmic scale, suggesting that percentage changes in price might be more meaningful than absolute changes. The boxplot clearly shows many outliers on the high end of the price spectrum, which we’ll need to address in our feature engineering.

Geographic Analysis

If the dataset includes location information, let’s visualize the geographic distribution.

# Check if we have latitude and longitude before attempting the maps
if(all(c("latitude", "longitude") %in% names(train_data))) {
  # Create a leaflet map with one small marker per listing
  leaflet_map <- leaflet(train_data) %>%
    addTiles() %>%
    addCircleMarkers(
      lng = ~longitude,
      lat = ~latitude,
      radius = 2,
      color = "blue",
      fillOpacity = 0.5,
      popup = ~paste("Price:", price, "<br>",
                    "Room Type:", room_type)
    ) %>%
    addControl(html = "<b>Airbnb Listings in Berlin</b>", position = "topright")
  
  # Display the map. print() is required: inside a braced `if` block only
  # the value of the LAST expression auto-prints, so this widget was
  # previously discarded. NOTE(review): confirm that print() is enough for
  # htmlwidgets in this knit setup; htmltools::tagList() is the robust
  # alternative for widgets emitted mid-block.
  print(leaflet_map)
  
  # Plot price heatmap by location.
  # NOTE(review): this mutates train_data by adding a price_bin column — a
  # side effect that persists into later chunks. NA prices produce NA bins.
  train_data$price_bin <- cut(train_data$price, 
                             breaks = c(0, 50, 100, 150, 200, Inf),
                             labels = c("< 50", "50-100", "100-150", "150-200", "> 200"))
  
  # Discrete color palette keyed to the five price bins
  price_pal <- colorFactor(
    palette = c("green", "blue", "purple", "orange", "red"),
    domain = train_data$price_bin
  )
  
  # Create a leaflet map with markers colored by price bin
  price_map <- leaflet(train_data) %>%
    addTiles() %>%
    addCircleMarkers(
      lng = ~longitude,
      lat = ~latitude,
      radius = 2,
      color = ~price_pal(price_bin),
      fillOpacity = 0.7,
      popup = ~paste("Price:", price, "<br>",
                    "Room Type:", room_type)
    ) %>%
    addLegend("bottomright", 
              pal = price_pal, 
              values = ~price_bin,
              title = "Price (Euro)",
              opacity = 1)
  
  # Display the map (last expression, so it auto-prints; print() kept for symmetry)
  print(price_map)
}

Results: The geographic visualizations reveal interesting spatial patterns. Airbnb listings are concentrated in central Berlin, particularly in popular districts like Mitte, Friedrichshain-Kreuzberg, and Neukölln. The price heatmap shows higher-priced listings (red and orange) tend to be located in the central areas and near tourist attractions, while more affordable options (green and blue) are often found in the surrounding neighborhoods. This spatial distribution suggests location is likely to be a significant factor in our demand model.

Neighborhood Analysis

Let’s analyze price variations by neighborhood.

# Check if neighborhood information is available
if("neighbourhood" %in% names(train_data)) {
  # Count listings by neighborhood, most frequent first
  neighborhood_counts <- train_data %>%
    count(neighbourhood) %>%
    arrange(desc(n))
  
  # Top 15 neighborhoods by listing count.
  # print() is required throughout this block: inside a braced `if` only the
  # last expression auto-prints, so the intermediate kable()/ggplot() results
  # were previously silently discarded and never rendered.
  top_neighborhoods <- head(neighborhood_counts, 15)
  print(kable(top_neighborhoods, caption = "Top 15 Neighborhoods by Number of Listings"))
  
  # Visualize neighborhood distribution
  print(
    ggplot(top_neighborhoods, aes(x = reorder(neighbourhood, n), y = n)) +
      geom_bar(stat = "identity", fill = "steelblue") +
      coord_flip() +
      labs(title = "Top 15 Neighborhoods by Number of Listings",
           x = "Neighborhood",
           y = "Number of Listings") +
      theme_minimal()
  )
  
  # Average and median price by neighborhood (NA prices excluded)
  avg_price_by_neighborhood <- train_data %>%
    group_by(neighbourhood) %>%
    summarise(
      avg_price = mean(price, na.rm = TRUE),
      median_price = median(price, na.rm = TRUE),
      count = n()
    ) %>%
    filter(count >= 10) %>%  # Only include neighborhoods with at least 10 listings
    arrange(desc(avg_price))
  
  # Top 15 neighborhoods by average price
  top_price_neighborhoods <- head(avg_price_by_neighborhood, 15)
  print(kable(top_price_neighborhoods, caption = "Top 15 Neighborhoods by Average Price"))
  
  # Visualize average price by neighborhood
  print(
    ggplot(top_price_neighborhoods, 
           aes(x = reorder(neighbourhood, avg_price), y = avg_price)) +
      geom_bar(stat = "identity", fill = "steelblue") +
      geom_text(aes(label = round(avg_price, 0)), hjust = -0.1, size = 3) +
      coord_flip() +
      labs(title = "Top 15 Neighborhoods by Average Price",
           x = "Neighborhood",
           y = "Average Price (Euro)") +
      theme_minimal()
  )
  
  # Price distribution by neighborhood (box plots)
  # Select top 10 neighborhoods by listing count for readability
  top10_neighborhoods <- neighborhood_counts$neighbourhood[1:10]
  
  # Order boxes by median price. na.rm is needed: 9 listings have NA price,
  # and a bare median() would return NA for their neighborhoods, breaking
  # the ordering.
  print(
    train_data %>%
      filter(neighbourhood %in% top10_neighborhoods) %>%
      ggplot(aes(x = reorder(neighbourhood, price,
                             FUN = function(p) median(p, na.rm = TRUE)),
                 y = price)) +
      geom_boxplot(fill = "steelblue", alpha = 0.7) +
      coord_flip() +
      labs(title = "Price Distribution by Neighborhood",
           x = "Neighborhood",
           y = "Price (Euro)") +
      theme_minimal()
  )
}

Property and Room Type Analysis

Let’s analyze the property types and room types in our dataset.

# Check if property_type exists
if("property_type" %in% names(train_data)) {
  # Distribution of property types, most common first
  property_counts <- train_data %>%
    count(property_type) %>%
    arrange(desc(n))
  
  # Show top property types.
  # print() is required: inside a braced `if` block only the last expression
  # auto-prints, so this kable() was previously discarded and never rendered.
  print(kable(head(property_counts, 10), caption = "Top 10 Property Types"))
  
  # Visualize property type distribution (last expression: auto-printed)
  ggplot(head(property_counts, 10), aes(x = reorder(property_type, n), y = n)) +
    geom_bar(stat = "identity", fill = "steelblue") +
    coord_flip() +
    labs(title = "Top 10 Property Types",
         x = "Property Type",
         y = "Count") +
    theme_minimal()
}

# Check if room_type exists
if("room_type" %in% names(train_data)) {
  # Distribution of room types, most common first
  room_counts <- train_data %>%
    count(room_type) %>%
    arrange(desc(n))
  
  # Show room types.
  # print() is required: inside a braced `if` block only the last expression
  # auto-prints, so the intermediate kable()/ggplot() results were
  # previously discarded and never rendered.
  print(kable(room_counts, caption = "Distribution of Room Types"))
  
  # Visualize room type distribution
  print(
    ggplot(room_counts, aes(x = reorder(room_type, n), y = n)) +
      geom_bar(stat = "identity", fill = "steelblue") +
      coord_flip() +
      labs(title = "Distribution of Room Types",
           x = "Room Type",
           y = "Count") +
      theme_minimal()
  )
  
  # Analyze average price by room type (NA prices excluded)
  avg_price_by_room <- train_data %>%
    group_by(room_type) %>%
    summarise(
      avg_price = mean(price, na.rm = TRUE),
      median_price = median(price, na.rm = TRUE),
      count = n()
    ) %>%
    arrange(desc(avg_price))
  
  print(kable(avg_price_by_room, caption = "Average Price by Room Type"))
  
  # Boxplot of price by room type (last expression: auto-printed)
  ggplot(train_data, aes(x = room_type, y = price)) +
    geom_boxplot(fill = "steelblue", alpha = 0.7) +
    labs(title = "Price Distribution by Room Type",
         x = "Room Type",
         y = "Price (Euro)") +
    theme_minimal()
}

Reviews and Ratings Analysis

Let’s analyze review counts and ratings to understand their relationship with property demand.

# Summary of review counts
if("reviews" %in% names(train_data)) {
  review_summary <- summary(train_data$reviews)
  print(review_summary)
  
  # Distribution of review counts.
  # print() is required: inside a braced `if` block only the last expression
  # auto-prints, so this first histogram was previously discarded.
  print(
    ggplot(train_data, aes(x = reviews)) +
      geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
      labs(title = "Distribution of Review Counts",
           x = "Number of Reviews",
           y = "Count") +
      theme_minimal()
  )
  
  # Log-transformed distribution for skewed data; zero-review listings must
  # be dropped before log scaling. Last expression: auto-printed.
  ggplot(train_data %>% filter(reviews > 0), aes(x = reviews)) +
    geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
    scale_x_log10() +
    labs(title = "Distribution of Review Counts (Log Scale)",
         x = "Number of Reviews (Log Scale)",
         y = "Count") +
    theme_minimal()
}
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    1.00    5.00   19.45   17.00  424.00

# Check for rating columns (matches overall_rating, accuracy_rating, etc.)
rating_columns <- names(train_data)[grepl("rating", names(train_data), ignore.case = TRUE)]

if(length(rating_columns) > 0) {
  # For each rating column: print a summary, its distribution, and its
  # relationship with price.
  for(col in rating_columns) {
    # Summary statistics
    rating_summary <- summary(train_data[[col]])
    cat(paste("\nSummary of", col, ":\n"))
    print(rating_summary)
    
    # Plot distribution. .data[[col]] is the recommended tidy-eval idiom for
    # referencing a column by a string name, replacing !!sym(col).
    p <- ggplot(train_data, aes(x = .data[[col]])) +
      geom_histogram(bins = 20, fill = "steelblue", alpha = 0.7) +
      labs(title = paste("Distribution of", tools::toTitleCase(col)),
           x = tools::toTitleCase(col),
           y = "Count") +
      theme_minimal()
    
    print(p)
    
    # If price is available, check correlation with ratings
    if("price" %in% names(train_data)) {
      # Scatter plot with a linear trend line
      p2 <- ggplot(train_data, aes(x = .data[[col]], y = price)) +
        geom_point(alpha = 0.3, color = "steelblue") +
        geom_smooth(method = "lm", color = "red") +
        labs(title = paste("Relationship Between", tools::toTitleCase(col), "and Price"),
             x = tools::toTitleCase(col),
             y = "Price (Euro)") +
        theme_minimal()
      
      print(p2)
    }
  }
}
## 
## Summary of overall_rating :
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   20.00   93.00   97.00   94.72  100.00  100.00    2962

## 
## Summary of accuracy_rating :
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.000  10.000  10.000   9.717  10.000  10.000    2971

## 
## Summary of cleanliness_rating :
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.000   9.000  10.000   9.328  10.000  10.000    2970

## 
## Summary of checkin_rating :
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.000  10.000  10.000   9.769  10.000  10.000    2973

## 
## Summary of communication_rating :
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.000  10.000  10.000   9.779  10.000  10.000    2970

## 
## Summary of location_rating :
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.000   9.000  10.000   9.557  10.000  10.000    2971

## 
## Summary of value_rating :
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.000   9.000  10.000   9.458  10.000  10.000    2972

Correlation Analysis

Let’s examine correlations between numeric variables to understand relationships.

# Select numeric columns for correlation analysis.
# select(where(...)) replaces the superseded select_if(), and any_of()
# replaces the deprecated one_of() (it also tolerates absent names).
numeric_columns <- train_data %>%
  select(where(is.numeric)) %>%
  # Drop identifiers and raw coordinates: correlations on them are not meaningful
  select(-matches("id|_id$|index|^X$"), -any_of(c("latitude", "longitude")))

# If there are numeric columns, create correlation matrix
if(ncol(numeric_columns) > 1) {
  # Pairwise-complete correlations: each variable pair uses every row where
  # both are non-NA — important here because square_feet is ~98% missing,
  # so its correlations rest on only ~300 rows.
  cor_matrix <- cor(numeric_columns, use = "pairwise.complete.obs")
  
  # Plot correlation matrix
  corrplot(cor_matrix, method = "circle", type = "upper", 
           tl.col = "black", tl.srt = 45, tl.cex = 0.7,
           title = "Correlation Matrix of Numeric Variables")
  
  # Display top correlations with price (if available)
  if("price" %in% names(numeric_columns)) {
    price_cors <- cor_matrix["price", ]
    price_cors <- price_cors[order(abs(price_cors), decreasing = TRUE)]
    # NOTE(review): != 1 drops the self-correlation, but would also drop any
    # other variable that happened to correlate exactly 1 with price.
    price_cors <- price_cors[price_cors != 1]  # Remove self-correlation
    
    cat("\nTop correlations with price:\n")
    print(head(price_cors, 10))
  }
}

## 
## Top correlations with price:
##          square_feet              reviews      location_rating 
##           0.50893450           0.07823090           0.07605773 
##   cleanliness_rating         value_rating       overall_rating 
##           0.07263419          -0.05762347           0.04353725 
## communication_rating      accuracy_rating       checkin_rating 
##           0.02888236           0.02277033           0.01420024

Creating a Demand Proxy

Since our dataset may not have a direct demand measure, we need to create a proxy based on available data. This proxy will be our target variable for predicting property popularity.

# Check which columns we have available for demand proxy creation
potential_demand_columns <- c("reviews", "price")
available_columns <- potential_demand_columns[potential_demand_columns %in% names(train_data)]

cat("Available columns for demand proxy creation:", paste(available_columns, collapse = ", "), "\n")
# Create a simple demand proxy based on available data
train_data_with_proxy <- train_data

# If review data is available
if ("reviews" %in% names(train_data)) {
  # Normalize review counts to [0, 1] (higher = more demand)
  max_reviews <- max(train_data$reviews, na.rm = TRUE)
  train_data_with_proxy$review_score <- train_data$reviews / max_reviews
  cat("Created review_score from reviews column\n")
  
  # Quartile breakpoints for a coarse "availability" score derived from
  # review counts
  review_quantiles <- quantile(train_data$reviews, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE)
  
  # For debugging, print the quantiles
  cat("Review quantiles:", paste(round(review_quantiles, 2), collapse = ", "), "\n")
  
  # cut() errors on duplicated breakpoints, and heavily skewed review counts
  # can make quartiles coincide (e.g. many zero-review listings).
  # Deduplicate the breaks and size the labels to match (n breaks ->
  # n - 1 intervals); with 5 unique breaks this reproduces the original
  # labels 0, 0.25, 0.5, 0.75.
  review_breaks <- unique(review_quantiles)
  interval_labels <- head(seq(0, 1, length.out = length(review_breaks)), -1)
  
  train_data_with_proxy$availability_score <- as.numeric(as.character(
    cut(train_data$reviews, 
        breaks = review_breaks,
        labels = interval_labels,
        include.lowest = TRUE)
  ))
  
  cat("Created availability_score as a proxy from reviews column\n")
  
  # Combined demand score: simple average of the two component scores
  train_data_with_proxy$demand_proxy <- (train_data_with_proxy$review_score + 
                                       train_data_with_proxy$availability_score) / 2
  cat("Created demand_proxy from review_score and availability_score\n")
  
} else if ("price" %in% names(train_data)) {
  # Fallback: normalized price when no review data exists
  max_price <- max(train_data$price, na.rm = TRUE)
  train_data_with_proxy$demand_proxy <- train_data$price / max_price
  cat("Created demand_proxy from price (fallback method)\n")
} else {
  cat("No suitable demand proxy could be created from available data\n")
}
## Created review_score from reviews column
## Review quantiles: 0, 1, 5, 17, 424 
## Created availability_score as a proxy from reviews column
## Created demand_proxy from review_score and availability_score
# Visualize the demand proxy if created
if ("demand_proxy" %in% names(train_data_with_proxy)) {
  # NOTE: inside an if block only the final expression is auto-printed by
  # knitr, so each ggplot object and kable() table must be wrapped in
  # print() explicitly or it silently never appears in the rendered output.
  
  # Distribution of demand proxy
  print(
    ggplot(train_data_with_proxy, aes(x = demand_proxy)) +
      geom_histogram(bins = 30, fill = "steelblue", alpha = 0.7) +
      labs(title = "Distribution of Demand Proxy",
           x = "Demand Proxy (0-1 scale)",
           y = "Count") +
      theme_minimal()
  )
  
  # If we have neighborhood data, show average demand by neighborhood
  if ("neighbourhood" %in% names(train_data)) {
    # Calculate average demand by neighborhood
    avg_demand_by_neighborhood <- train_data_with_proxy %>%
      group_by(neighbourhood) %>%
      summarise(
        avg_demand = mean(demand_proxy, na.rm = TRUE),
        count = n()
      ) %>%
      filter(count >= 10) %>%  # Only include neighborhoods with at least 10 listings
      arrange(desc(avg_demand))
    
    # Top 15 neighborhoods by average demand
    top_demand_neighborhoods <- head(avg_demand_by_neighborhood, 15)
    print(kable(top_demand_neighborhoods, caption = "Top 15 Neighborhoods by Average Demand"))
    
    # Visualize average demand by neighborhood
    print(
      ggplot(top_demand_neighborhoods, 
             aes(x = reorder(neighbourhood, avg_demand), y = avg_demand)) +
        geom_bar(stat = "identity", fill = "steelblue") +
        coord_flip() +
        labs(title = "Top 15 Neighborhoods by Average Demand",
             x = "Neighborhood",
             y = "Average Demand Proxy") +
        theme_minimal()
    )
  }
  
  # Save the data with demand proxy to processed folder
  processed_data_path <- "../data/processed/"
  dir.create(processed_data_path, recursive = TRUE, showWarnings = FALSE)
  
  # Save train data with demand proxy
  write.csv(train_data_with_proxy, paste0(processed_data_path, "train_berlin_clean.csv"), row.names = FALSE)
  
  # Also save test data (without demand proxy as that's what we're predicting)
  write.csv(test_data, paste0(processed_data_path, "test_berlin_clean.csv"), row.names = FALSE)
  
  cat("Saved processed data with demand_proxy to", processed_data_path, "\n")
}
## Saved processed data with demand_proxy to ../data/processed/